Version: 1.0 (Jupytext, time measurements, logger)
Please put your comments about the notebook functionality here.
import sys
import os
# Make project packages importable: the notebook lives one or two directory levels below the repo root.
sys.path+=[os.path.join(os.getcwd(), ".."), os.path.join(os.getcwd(), "../..")] # one and two up
ToC
Necessary libraries for notebook functionality:
NOTE: This way, using the function, the button works only in the active notebook. If the functionality needs to be preserved in the HTML export, then the code has to be included directly in the notebook.
from src.utils.notebook_support_functions import create_button, get_notebook_name
from src.utils.logger import Logger
from src.utils.envs import Envs
from src.utils.config import Config
from pandas import options
from IPython.display import display, HTML
Constants for overall behaviour.
# Name of the logger configuration to load (file + console handlers, per the config name).
LOGGER_CONFIG_NAME = "logger_file_console" # default
# Name of the python configuration to load.
PYTHON_CONFIG_NAME = "python_personal" # default
# When True, create_button() adds a toggle button in the active notebook (not preserved in HTML export, see note above).
CREATE_BUTTON = False
# When True, widen the notebook container to the full browser width.
# NOTE(review): name is misspelled ("ADDAPT" -> "ADAPT"); kept as-is to avoid breaking any references.
ADDAPT_WIDTH = False
NOTEBOOK_NAME = get_notebook_name()
# Raise pandas display limits so wide/long frames render fully in the notebook.
options.display.max_rows = 500
options.display.max_columns = 500
# Register the logger/config names globally, then start the notebook-level timer.
envs = Envs()
envs.set_logger(LOGGER_CONFIG_NAME)
envs.set_config(PYTHON_CONFIG_NAME)
Logger().start_timer(f"NOTEBOOK; Notebook name: {NOTEBOOK_NAME}")
if CREATE_BUTTON:
    create_button()
if ADDAPT_WIDTH:
    display(HTML("<style>.container { width:100% !important; }</style>")) # notebook width
A: ../../configurations\logger_file_console.conf 2023-11-28 13:04:42,439 - git.util - DEBUG - Failed checking if running in CYGWIN due to: FileNotFoundError(2, 'The system cannot find the file specified', None, 2, None) 2023-11-28 13:04:42,442 - file_console - DEBUG - Logger was created on WS-3000 in branche 014_update_repository. 2023-11-28 13:04:42,442 - file_console - DEBUG - Process: NOTEBOOK; Notebook name: machine_learning_models_documentation.py; Timer started;
from typing import Any
from numpy import float32, array, ndarray, dtype, double
from collections import Counter, OrderedDict
from pandas import Series
from src.data.income_weather_data_generator import IncomeWeatherDataGenerator
from src.data.splitter import Splitter
from src.models.ml_r_mean_model import MeanModel
from src.models.ml_r_stats_lin_reg_model import StatsLinRegModel
from src.models.ml_c_max_entropy_classification_random_model import MaxEntropyClassificationRandomModel
from src.models.ml_c_distribution_classification_random_model import DistributionClassificationRandomModel
from src.visualisations.plotly_line_chart import PlotlyLineChart
from src.visualisations.plotly_histogram import PlotlyHistogram
from src.visualisations.plotly_histogram_multi import PlotlyHistogramMulti
from src.visualisations.plotly_bar_chart import PlotlyBarChart
# from src.global_constants import * # Remember to import only the constants in use
# Number of prediction rows to print for quick visual checks.
N_ROWS_TO_DISPLAY = 2
FIGURE_SIZE_SETTING = {"autosize": False, "width": 2200, "height": 750}
DATA_PROCESSING_CONFIG_NAME = "data_processing_basic"
# Shared helper/visualisation instances reused by the cells below.
splitter = Splitter()
line_chart = PlotlyLineChart()
hist = PlotlyHistogram()
# BUG FIX: was PlotlyHistogram(); PlotlyHistogramMulti is imported above and was never instantiated,
# so hist_multi held the wrong (single-series) histogram class.
hist_multi = PlotlyHistogramMulti()
bar_chart = PlotlyBarChart()
# Generate the synthetic income/weather dataset: features plus regression, binary,
# ternary and one-hot ternary targets, each already split into train/test.
(X_train, X_test), (Y_train_reg, Y_test_reg), (Y_train_bin, Y_test_bin), (Y_train_ter, Y_test_ter), \
    (Y_train_ter_oh, Y_test_ter_oh) = IncomeWeatherDataGenerator().generate_basic_ml_data()
# Sanity-check the shape of every split.
for split in (X_train, Y_train_reg, Y_train_bin, Y_train_ter, Y_train_ter_oh):
    print(split.shape)
print("\n")
for split in (X_test, Y_test_reg, Y_test_bin, Y_test_ter, Y_test_ter_oh):
    print(split.shape)
(5479, 12) (5479, 1) (5479, 1) (5479, 1) (5479, 3) (1827, 12) (1827, 1) (1827, 1) (1827, 1) (1827, 3)
def plot_histograms_and_residuals(train_res: ndarray[Any, dtype[double]], test_res: ndarray[Any, dtype[double]]) -> None:
    """
    Plots residual histograms followed by per-index residual line charts for train and test sets.

    :param train_res: Train residuals; assumes shape (n, 1) or (n,) — flattened before plotting.
    :param test_res: Test residuals; same shape assumption as train_res.
    """
    labelled = (("Train Residuals", train_res), ("Test Residuals", test_res))
    # Histograms first (train, then test), matching the original display order.
    for title, res in labelled:
        hist.plot(data=res.reshape((res.shape[0],)), plot_title=title, x_title="Y-Y hat")
    # Then residual-vs-index line charts (train, then test).
    for title, res in labelled:
        line_chart.plot(
            lines=[(array(range(res.shape[0])), res.reshape((res.shape[0],)))],
            plot_title=title,
            x_title="Index",
            y_title="Y-Y_hat",
        )
# Baseline regressor: predicts a constant value (the R2 of ~0 below confirms a mean-style baseline).
model = MeanModel()
model_params = {}
train_predictions = model.fit_predict(X=X_train, Y=Y_train_reg, model_params=model_params)
test_predictions = model.predict(X=X_test)
train_residuals = model.get_residuals(X=X_train, Y=Y_train_reg)
test_residuals = model.get_residuals(X=X_test, Y=Y_test_reg)
for preds in (train_predictions, test_predictions):
    print(preds[0:5])
for split_name, X_split, Y_split in (("Train", X_train, Y_train_reg), ("Test", X_test, Y_test_reg)):
    print(f"{split_name} R2 score: {model.get_r2_score(X=X_split, Y=Y_split)}")
plot_histograms_and_residuals(train_residuals, test_residuals)
[[681.63153] [681.63153] [681.63153] [681.63153] [681.63153]] [[681.63153] [681.63153] [681.63153] [681.63153] [681.63153]] Train R2 score: -9.547918011776346e-15 Test R2 score: -1.3296558846187523e-05
# Ordinary least squares regression (statsmodels-backed), fitted with an intercept term.
model = StatsLinRegModel()
model_params = {"intercept": True}
train_predictions = model.fit_predict(X=X_train, Y=Y_train_reg, model_params=model_params)
test_predictions = model.predict(X=X_test)
train_residuals = model.get_residuals(X=X_train, Y=Y_train_reg)
test_residuals = model.get_residuals(X=X_test, Y=Y_test_reg)
for preds in (train_predictions, test_predictions):
    print(preds[0:5])
for split_name, X_split, Y_split in (("Train", X_train, Y_train_reg), ("Test", X_test, Y_test_reg)):
    print(f"{split_name} R2 score: {model.get_r2_score(X=X_split, Y=Y_split)}")
plot_histograms_and_residuals(train_residuals, test_residuals)
[[ 422.59083306] [ 938.58093491] [ 709.94520638] [1058.99726787] [ 787.98741477]] [[580.29172711] [424.30054184] [557.23010679] [369.15790337] [542.98611909]] Train R2 score: 0.9978863195039516 Test R2 score: 0.9977919431102703
# Kolmogorov-Smirnov ('ks') normality test on the train and test residuals.
for X_split, Y_split in ((X_train, Y_train_reg), (X_test, Y_test_reg)):
    print(f"Stat, p-value: {model.do_normality_test_for_residuals(X_split, Y_split, 'ks')}")
Stat, p-value: (0.4008106515966553, 0.0) Stat, p-value: (0.40561743158116426, 4.268010175003165e-272)
# Shapiro-Wilk ('sw') normality test on the residuals (scipy warns its p-value is inaccurate for N > 5000).
for X_split, Y_split in ((X_train, Y_train_reg), (X_test, Y_test_reg)):
    print(f"Stat, p-value: {model.do_normality_test_for_residuals(X_split, Y_split, 'sw')}")
Stat, p-value: (0.9995805621147156, 0.2908723056316376) Stat, p-value: (0.9985916018486023, 0.13742749392986298)
D:\ds_ml_template\.venv\lib\site-packages\scipy\stats\_morestats.py:1816: UserWarning: p-value may not be accurate for N > 5000.
# Access the underlying fitted statsmodels results object and display its coefficient vector.
lr_model = model.get_model()
model_stats = lr_model.params
model_stats
array([ 5.45523758, 30.01484283, 0.72797686, -0.05753403,
2.88036519, 1.84085437, 5.51933844, -1.52137758,
-3.93438568, -4.65969029, -14.6032986 , 19.93986449,
4.77836198])
lr_model.summary()
| Dep. Variable: | y | R-squared: | 0.998 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.998 |
| Method: | Least Squares | F-statistic: | 2.581e+05 |
| Date: | Tue, 28 Nov 2023 | Prob (F-statistic): | 0.00 |
| Time: | 13:04:43 | Log-Likelihood: | -20370. |
| No. Observations: | 5479 | AIC: | 4.076e+04 |
| Df Residuals: | 5468 | BIC: | 4.083e+04 |
| Df Model: | 10 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 5.4552 | 0.317 | 17.203 | 0.000 | 4.834 | 6.077 |
| x1 | 30.0148 | 0.019 | 1602.599 | 0.000 | 29.978 | 30.052 |
| x2 | 0.7280 | 0.332 | 2.192 | 0.028 | 0.077 | 1.379 |
| x3 | -0.0575 | 0.333 | -0.173 | 0.863 | -0.711 | 0.596 |
| x4 | 2.8804 | 0.334 | 8.613 | 0.000 | 2.225 | 3.536 |
| x5 | 1.8409 | 0.333 | 5.527 | 0.000 | 1.188 | 2.494 |
| x6 | 5.5193 | 0.334 | 16.533 | 0.000 | 4.865 | 6.174 |
| x7 | -1.5214 | 0.333 | -4.562 | 0.000 | -2.175 | -0.868 |
| x8 | -3.9344 | 0.332 | -11.837 | 0.000 | -4.586 | -3.283 |
| x9 | -4.6597 | 0.248 | -18.774 | 0.000 | -5.146 | -4.173 |
| x10 | -14.6033 | 0.246 | -59.436 | 0.000 | -15.085 | -14.122 |
| x11 | 19.9399 | 0.247 | 80.630 | 0.000 | 19.455 | 20.425 |
| x12 | 4.7784 | 0.245 | 19.485 | 0.000 | 4.298 | 5.259 |
| Omnibus: | 3.326 | Durbin-Watson: | 2.036 |
|---|---|---|---|
| Prob(Omnibus): | 0.190 | Jarque-Bera (JB): | 3.445 |
| Skew: | 0.025 | Prob(JB): | 0.179 |
| Kurtosis: | 3.112 | Cond. No. | 1.73e+17 |
# Keep the summary object so its individual sub-tables can be inspected; show the model-overview table.
summary = lr_model.summary()
summary.tables[0]
| Dep. Variable: | y | R-squared: | 0.998 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.998 |
| Method: | Least Squares | F-statistic: | 2.581e+05 |
| Date: | Tue, 28 Nov 2023 | Prob (F-statistic): | 0.00 |
| Time: | 13:04:43 | Log-Likelihood: | -20370. |
| No. Observations: | 5479 | AIC: | 4.076e+04 |
| Df Residuals: | 5468 | BIC: | 4.083e+04 |
| Df Model: | 10 | ||
| Covariance Type: | nonrobust |
summary.tables[0].data
[['Dep. Variable:', 'y', ' R-squared: ', ' 0.998'], ['Model:', 'OLS', ' Adj. R-squared: ', ' 0.998'], ['Method:', 'Least Squares', ' F-statistic: ', '2.581e+05'], ['Date:', 'Tue, 28 Nov 2023', ' Prob (F-statistic):', ' 0.00'], ['Time:', '13:04:43', ' Log-Likelihood: ', ' -20370.'], ['No. Observations:', ' 5479', ' AIC: ', '4.076e+04'], ['Df Residuals:', ' 5468', ' BIC: ', '4.083e+04'], ['Df Model:', ' 10', ' ', ' '], ['Covariance Type:', 'nonrobust', ' ', ' ']]
summary.tables[1]
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 5.4552 | 0.317 | 17.203 | 0.000 | 4.834 | 6.077 |
| x1 | 30.0148 | 0.019 | 1602.599 | 0.000 | 29.978 | 30.052 |
| x2 | 0.7280 | 0.332 | 2.192 | 0.028 | 0.077 | 1.379 |
| x3 | -0.0575 | 0.333 | -0.173 | 0.863 | -0.711 | 0.596 |
| x4 | 2.8804 | 0.334 | 8.613 | 0.000 | 2.225 | 3.536 |
| x5 | 1.8409 | 0.333 | 5.527 | 0.000 | 1.188 | 2.494 |
| x6 | 5.5193 | 0.334 | 16.533 | 0.000 | 4.865 | 6.174 |
| x7 | -1.5214 | 0.333 | -4.562 | 0.000 | -2.175 | -0.868 |
| x8 | -3.9344 | 0.332 | -11.837 | 0.000 | -4.586 | -3.283 |
| x9 | -4.6597 | 0.248 | -18.774 | 0.000 | -5.146 | -4.173 |
| x10 | -14.6033 | 0.246 | -59.436 | 0.000 | -15.085 | -14.122 |
| x11 | 19.9399 | 0.247 | 80.630 | 0.000 | 19.455 | 20.425 |
| x12 | 4.7784 | 0.245 | 19.485 | 0.000 | 4.298 | 5.259 |
summary.tables[2]
| Omnibus: | 3.326 | Durbin-Watson: | 2.036 |
|---|---|---|---|
| Prob(Omnibus): | 0.190 | Jarque-Bera (JB): | 3.445 |
| Skew: | 0.025 | Prob(JB): | 0.179 |
| Kurtosis: | 3.112 | Cond. No. | 1.73e+17 |
print(lr_model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.998
Model: OLS Adj. R-squared: 0.998
Method: Least Squares F-statistic: 2.581e+05
Date: Tue, 28 Nov 2023 Prob (F-statistic): 0.00
Time: 13:04:43 Log-Likelihood: -20370.
No. Observations: 5479 AIC: 4.076e+04
Df Residuals: 5468 BIC: 4.083e+04
Df Model: 10
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 5.4552 0.317 17.203 0.000 4.834 6.077
x1 30.0148 0.019 1602.599 0.000 29.978 30.052
x2 0.7280 0.332 2.192 0.028 0.077 1.379
x3 -0.0575 0.333 -0.173 0.863 -0.711 0.596
x4 2.8804 0.334 8.613 0.000 2.225 3.536
x5 1.8409 0.333 5.527 0.000 1.188 2.494
x6 5.5193 0.334 16.533 0.000 4.865 6.174
x7 -1.5214 0.333 -4.562 0.000 -2.175 -0.868
x8 -3.9344 0.332 -11.837 0.000 -4.586 -3.283
x9 -4.6597 0.248 -18.774 0.000 -5.146 -4.173
x10 -14.6033 0.246 -59.436 0.000 -15.085 -14.122
x11 19.9399 0.247 80.630 0.000 19.455 20.425
x12 4.7784 0.245 19.485 0.000 4.298 5.259
==============================================================================
Omnibus: 3.326 Durbin-Watson: 2.036
Prob(Omnibus): 0.190 Jarque-Bera (JB): 3.445
Skew: 0.025 Prob(JB): 0.179
Kurtosis: 3.112 Cond. No. 1.73e+17
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.02e-28. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
def get_ordered_classes_frequency(Y: ndarray[Any, dtype[double]]) -> OrderedDict:
    """
    Gets the frequency of each class, ordered by ascending class id.

    :param Y: Column vector (n, 1) or flat array (n,) of class labels.
    :return: OrderedDict mapping class id -> occurrence count, with keys sorted ascending.
    """
    # Flatten to 1-D so Counter sees scalar labels, then sort the (class, count) pairs by class id.
    counts = Counter(Y.reshape((Y.shape[0],)))
    return OrderedDict(sorted(counts.items()))
# Random classifier baseline; the roughly uniform prediction counts below suggest max-entropy sampling.
# oh=False: works with integer class labels rather than one-hot rows.
model = MaxEntropyClassificationRandomModel()
model_params = {"oh": False}
train_predictions = model.fit_predict(X=X_train, Y=Y_train_ter, model_params=model_params)
test_predictions = model.predict(X=X_test)
for preds in (train_predictions, test_predictions):
    print(preds[0:N_ROWS_TO_DISPLAY])
[[2.] [0.]] [[2.] [0.]]
# Compare true training class frequencies with the frequencies of the model's predictions.
print(get_ordered_classes_frequency(Y_train_ter))
print(get_ordered_classes_frequency(model.predict(X_train)))
OrderedDict([(0, 1411), (1, 2967), (2, 1101)]) OrderedDict([(0.0, 1841), (1.0, 1813), (2.0, 1825)])
# Bar chart of the true ternary class frequencies in the training labels.
freq = get_ordered_classes_frequency(Y_train_ter)
class_ids = array(list(freq.keys()))
class_counts = array(list(freq.values()))
bar_chart.plot(
    array_ids=class_ids,
    array_values=class_counts,
    plot_title="",
    name_ids="",
    name_values="",
    order_by_values=False,
    reverse=False,
)
# Bar chart of the predicted class frequencies on the training features.
freq = get_ordered_classes_frequency(model.predict(X_train))
class_ids = array(list(freq.keys()))
class_counts = array(list(freq.values()))
bar_chart.plot(
    array_ids=class_ids,
    array_values=class_counts,
    plot_title="",
    name_ids="",
    name_values="",
    order_by_values=False,
    reverse=False,
)
# Same random baseline, but with oh=True: fits and predicts one-hot encoded rows.
model = MaxEntropyClassificationRandomModel()
model_params = {"oh": True}
train_predictions = model.fit_predict(X=X_train, Y=Y_train_ter_oh, model_params=model_params)
test_predictions = model.predict(X=X_test)
for preds in (train_predictions, test_predictions):
    print(preds[0:N_ROWS_TO_DISPLAY])
[[0. 0. 1.] [1. 0. 0.]] [[0. 0. 1.] [1. 0. 0.]]
# Compare per-class counts: true one-hot labels vs. predicted one-hot outputs (column sums).
print(Y_train_ter_oh.sum(axis=0))
print(train_predictions.sum(axis=0))
[1411. 2967. 1101.] [1841. 1813. 1825.]
# Bar chart of true class counts from the one-hot training labels (column sums).
freq = Y_train_ter_oh.sum(axis=0)
class_ids = array(list(range(Y_train_ter_oh.shape[1])))
bar_chart.plot(
    array_ids=class_ids,
    array_values=array(freq),
    plot_title="",
    name_ids="",
    name_values="",
    order_by_values=False,
    reverse=False,
)
# Bar chart of predicted class counts from the one-hot predictions (column sums).
freq = model.predict(X_train).sum(axis=0)
class_ids = array(list(range(Y_train_ter_oh.shape[1])))
bar_chart.plot(
    array_ids=class_ids,
    array_values=array(freq),
    plot_title="",
    name_ids="",
    name_values="",
    order_by_values=False,
    reverse=False,
)
# Random classifier baseline; the prediction counts below track the training class
# distribution, so it presumably samples from the empirical label distribution.
model = DistributionClassificationRandomModel()
model_params = {"oh": False}
train_predictions = model.fit_predict(X=X_train, Y=Y_train_ter, model_params=model_params)
test_predictions = model.predict(X=X_test)
for preds in (train_predictions, test_predictions):
    print(preds[0:N_ROWS_TO_DISPLAY])
[[0.] [0.]] [[0.] [0.]]
# Compare true training class frequencies with the frequencies of the model's predictions.
print(get_ordered_classes_frequency(Y_train_ter))
print(get_ordered_classes_frequency(model.predict(X_train)))
OrderedDict([(0, 1411), (1, 2967), (2, 1101)]) OrderedDict([(0.0, 1403), (1.0, 2979), (2.0, 1097)])
# Bar chart of the true ternary class frequencies in the training labels.
freq = get_ordered_classes_frequency(Y_train_ter)
class_ids = array(list(freq.keys()))
class_counts = array(list(freq.values()))
bar_chart.plot(
    array_ids=class_ids,
    array_values=class_counts,
    plot_title="",
    name_ids="",
    name_values="",
    order_by_values=False,
    reverse=False,
)
# Bar chart of the predicted class frequencies on the training features.
freq = get_ordered_classes_frequency(model.predict(X_train))
class_ids = array(list(freq.keys()))
class_counts = array(list(freq.values()))
bar_chart.plot(
    array_ids=class_ids,
    array_values=class_counts,
    plot_title="",
    name_ids="",
    name_values="",
    order_by_values=False,
    reverse=False,
)
# Same distribution-based random baseline, but with oh=True: one-hot encoded labels in/out.
model = DistributionClassificationRandomModel()
model_params = {"oh": True}
train_predictions = model.fit_predict(X=X_train, Y=Y_train_ter_oh, model_params=model_params)
test_predictions = model.predict(X=X_test)
for preds in (train_predictions, test_predictions):
    print(preds[0:N_ROWS_TO_DISPLAY])
[[1. 0. 0.] [1. 0. 0.]] [[1. 0. 0.] [1. 0. 0.]]
# Compare per-class counts: true one-hot labels vs. predicted one-hot outputs (column sums).
print(Y_train_ter_oh.sum(axis=0))
print(train_predictions.sum(axis=0))
[1411. 2967. 1101.] [1403. 2979. 1097.]
# Bar chart of true class counts from the one-hot training labels (column sums).
freq = Y_train_ter_oh.sum(axis=0)
class_ids = array(list(range(Y_train_ter_oh.shape[1])))
bar_chart.plot(
    array_ids=class_ids,
    array_values=array(freq),
    plot_title="",
    name_ids="",
    name_values="",
    order_by_values=False,
    reverse=False,
)
# Bar chart of predicted class counts from the one-hot predictions (column sums).
freq = model.predict(X_train).sum(axis=0)
class_ids = array(list(range(Y_train_ter_oh.shape[1])))
bar_chart.plot(
    array_ids=class_ids,
    array_values=array(freq),
    plot_title="",
    name_ids="",
    name_values="",
    order_by_values=False,
    reverse=False,
)
Logger().end_timer()
2023-11-28 13:04:44,807 - file_console - DEBUG - Process: NOTEBOOK; Notebook name: machine_learning_models_documentation.py; Timer ended; Process Duration [s]: 2.36; Process Duration [m]: 0.04